// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !generic
// +build amd64,!generic

#include "textflag.h"

// func xorBytesSSE2(dst, a, b *byte, n int)
TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
    MOVQ  dst+0(FP), BX
    MOVQ  a+8(FP), SI
    MOVQ  b+16(FP), CX
    MOVQ  n+24(FP), DX
    TESTQ $15, DX            // AND 15 & len, if not zero jump to not_aligned.
    JNZ   not_aligned

aligned:
    MOVQ $0, AX // position in slices

loop16b:
    MOVOU (SI)(AX*1), X0   // XOR 16byte forwards.
    MOVOU (CX)(AX*1), X1
    PXOR  X1, X0
    MOVOU X0, (BX)(AX*1)
    ADDQ  $16, AX
    CMPQ  DX, AX
    JNE   loop16b
    RET

loop_1b:
    SUBQ  $1, DX           // XOR 1byte backwards.
    MOVB  (SI)(DX*1), DI
    MOVB  (CX)(DX*1), AX
    XORB  AX, DI
    MOVB  DI, (BX)(DX*1)
    TESTQ $7, DX           // AND 7 & len, if not zero jump to loop_1b.
    JNZ   loop_1b
    CMPQ  DX, $0           // if len is 0, ret.
    JE    ret
    TESTQ $15, DX          // AND 15 & len, if zero jump to aligned.
    JZ    aligned

not_aligned:
    TESTQ $7, DX           // AND $7 & len, if not zero jump to loop_1b.
    JNE   loop_1b
    SUBQ  $8, DX           // XOR 8bytes backwards.
    MOVQ  (SI)(DX*1), DI
    MOVQ  (CX)(DX*1), AX
    XORQ  AX, DI
    MOVQ  DI, (BX)(DX*1)
    CMPQ  DX, $16          // if len is greater or equal 16 here, it must be aligned.
    JGE   aligned

ret:
    RET
